knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, cache = FALSE, fig.show = "asis",
fig.align = "center")
options(scipen=999)
library(sf)
library(ggplot2)
library(tidycensus)
library(tidyverse)
library(ggtext)
library(glue)
library(leaflet)
library(mapview)
library(patchwork)
library(lwgeom)
library(FNN)
library(kableExtra)
library(corrplot)
library(htmltools) # For div() function
library(knitr) # insert images
source("https://raw.githubusercontent.com/urbanSpatial/Public-Policy-Analytics-Landing/master/functions.r")
# Change to your own workspace
# knitr::opts_knit$set(root.dir = "E:/Spring/Practicum/DataAnalysis/Chinatown") # Luming
# wd <- "E:/Spring/Practicum/DataAnalysis/Chinatown"
# Aki
knitr::opts_knit$set(root.dir = "~/Documents/MUSA/Spring25/MUSA8010_Practicum/Chinatown-Practicum/")
wd <- "~/Documents/MUSA/Spring25/MUSA8010_Practicum/Chinatown-Practicum/"
nhoods_4326 <-
# st_read("DataWrangling/data/philadelphia-neighborhoods.geojson") %>% # Luming
st_read("data/philadelphia-neighborhoods.geojson") %>% # Aki
st_transform('EPSG:4326')
# philly properties (from Sijia)
# from the last 5 years, inflation-adjusted prices, total area > 100 sq ft, prices > 30,000, outliers above 5 SD from the mean were removed
philly_properties <- st_read("data/property_philly_cleaned.geojson") # Aki
# philly_properties <- st_read("DataWrangling/data/property_philly_cleaned.geojson") # Luming
It is hard to talk about the history of the Philadelphia Chinatown without the construction of I-676, also known as the Vine Street Expressway. Although the highway serves as a significant thoroughfare through Center City Philadelphia, the indispensable repercussion on the adjacent neighborhoods still lingers today. Consequences of the highway such as demolishment of buildings, pedestrian safety threats, traffic congestion, as well as noise and air pollutions had been harming Chinatown and its surrounding neighborhoods over the past decades and are yet to be addressed.
Standing in opposition to the issues, City of Philadelphia Office of Transportation and Infrastructure and Sustainability (OTIS) brought forth the plan of capping sections of the expressway as a solution to address its historical harms on the Chinatown region. This “cap”, named the Chinatown Stitch project, aimed to create a safe and equitable green space for the public that would reconnect areas north and south to the Vine Street expressway. Receiving funds from the US department of Transportation and the local organizations, the construction of the Chinatown Stitch will start in 2027 and is expected to be completed in early 2030.
This purpose of this study is to help OTIS understand the housing market of Philadelphia Chinatown region before and after the implementation of the Chinatown Stitch. The first part of the study will focus the effects of highways on sales prices in the study region as well as the entire Philadelphia. House prices will then be estimated using three predictive models under three different future scenarios: business as usual without the implementation of the Chinatown Stitch, business as usual with the implementation of the Chinatown Stitch, and an alternative future in which the Chinatown Stitch significantly reshapes the urban landscape of the study region.
Philly_blockgroup <- st_read("Dataset/Philly_blockgroup/Philly_blockgroup.shp") %>%
st_transform('EPSG:2272')
# philly bounds
philly_bounds <- st_union(Philly_blockgroup)
studyarea <- st_read("Dataset/studyarea/StudyArea.shp") %>%
st_transform('EPSG:2272')
studyarea_blockgroup <- st_read("Dataset/studyarea/StudyArea_blockgroup_tract.shp") %>%
st_transform('EPSG:2272')
studyarea_north <- st_read("Dataset/studyarea-sub/studyarea_north.shp") %>%
st_transform('EPSG:2272')
studyarea_south <- st_read("Dataset/studyarea-sub/studyarea_south.shp") %>%
st_transform('EPSG:2272')
I_676 <- st_read("Dataset/Highways/I_676.shp") %>%
st_transform('EPSG:2272')
discontinuity <- st_read("Dataset/studyarea-sub/discontinuity.shp") %>%
st_transform('EPSG:2272')
Chinatown_Stitch <- st_read("Dataset/Chinatown_Stitch/Chinatown_Stitch.shp") %>%
st_transform('EPSG:2272')
property_original <- st_read("Dataset/original_property_studyarea.geojson") %>%
st_transform('EPSG:2272')
property_north <- property_original %>%
st_intersection(studyarea_north)
property_south <- property_original %>%
st_intersection(studyarea_south)
# philly hydrology for mapping (bounded by philly_bounds; source: https://opendataphilly.org/datasets/hydrology/)
hydro <- st_read("https://services.arcgis.com/fLeGjb7u4uXqeF9q/arcgis/rest/services/Hydrographic_Features_Poly/FeatureServer/1/query?outFields=*&where=1%3D1&f=geojson") %>%
st_transform(crs = 'EPSG:2272') %>%
st_intersection(philly_bounds)
philly_highways <- st_read("Dataset/Highways/Casestudy_Highways.shp") %>%
st_transform('EPSG:2272')
# cleaned and feature engineered property data for study area
property_studyarea_FE <- st_read("Dataset/property_CT_FE.geojson")
Three study areas are used in the analysis to reveal the effects of I-676 on the land market of the Chinatown region.
StudyArea_plot1 <- ggplot() +
geom_sf(data = Philly_blockgroup, fill = "transparent", color = "grey90", linewidth = .2) +
geom_sf(data = studyarea, fill = "#eb5600", color = "#eb5600") +
geom_sf(data = hydro, fill = "#DFF3E3", color = "transparent") +
geom_sf(data = philly_bounds, fill = "transparent", color = "grey60", linewidth = 1, linetype = "dashed") +
theme_void() +
labs(title = "Three Study Areas:",
subtitle = "Philadelphia | 10 Block Groups around the Chinatown Stitch | North and South Sides",
caption = "Source: ACS Census Data and OPA Data.") +
theme(plot.title = element_text(size = 16, face = "bold",margin = margin(b = 6)),
plot.subtitle = element_text(margin = margin(b = 10)),
plot.caption = element_text(hjust = 0, margin = margin(t = 10)))
StudyArea_plot2 <- ggplot() +
geom_sf(data = studyarea_north, fill = alpha("#eb5600", .1), color = "transparent") +
geom_sf(data = studyarea_south, fill = alpha("#1a9988", .1), color = "transparent") +
# geom_sf(data = discontinuity, fill = "black", color = "transparent") +
geom_sf(data = Chinatown_Stitch, fill = "grey", color = "transparent", linetype = "dashed", linewidth = 1) +
geom_sf(data = studyarea_blockgroup, fill = "transparent", color = "grey", linewidth = .8, linetype = "dashed") +
geom_sf(data = property_north, color = "#eb5600", size = .6) +
geom_sf(data = property_south, color = "#1a9988", size = .6) +
geom_sf(data = studyarea, fill = "transparent", color = "grey60", linewidth = 1.2, linetype = "dashed") +
theme_void()
StudyArea_plot1 | StudyArea_plot2
studyarea_4326 <- st_transform(studyarea, crs = 4326)
bbox <- as.list(st_bbox(studyarea_4326))
ChinatownStitch_4326 <- st_read("Dataset/Chinatown_Stitch/Chinatown_Stitch.shp") %>%
st_transform('EPSG:4326')
I676_4326 <- st_transform(I_676, crs = 4326)
landuse_4326 <- st_read("Dataset/landuse_clip/Land_Use_ClipLayer.shp") %>%
st_transform('EPSG:4326')
landuse_4326_plot <- landuse_4326 %>%
filter(C_DIG2DESC != 51 & C_DIG2DESC != 71)
landuse_park_ing <- landuse_4326 %>%
filter(OBJECTID_1 == 514214)
park_4326 <- st_read("Dataset/ParksModified/Parks_within_1km.shp") %>%
st_transform('EPSG:4326')
# nhoods_4326 <-
# st_read("DataWrangling/data/philadelphia-neighborhoods.geojson") %>%
# # st_read("data/philadelphia-neighborhoods.geojson") %>%
# st_transform('EPSG:4326')
Chinatown_Callowhill <- nhoods_4326 %>%
filter(NAME %in% c("CHINATOWN", "CALLOWHILL"))
In the vision of a city park in the future, the Chinatown Stitch will bring vitality to local residents and visitors along with other green spaces. Connected with the Stitch, the Rail Park in the north (Callowhill neighborhood) is still in construction and will make an impact years later. With a foundation of cultural and commercial resources, the southern part (Chinatown neighborhood) will provide services to the surrounding area, including the city center of Philadelphia.
UseCase <- leaflet() %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(data = studyarea_4326,
color = "black", # Black border
weight = 4, # Border thickness
dashArray = "8,12", # Dashed line pattern (5px on, 5px off)
fill = FALSE) %>%
addPolygons(data = landuse_4326_plot,
color = "grey",
weight = 0.8,
fillColor = "#DFF3E3",
fillOpacity = 0.4) %>%
addPolygons(data = Chinatown_Callowhill,
color = "#eb5600",
weight = 1,
opacity = 0.5,
dashArray = "5,5",
fillColor = "#eb5600",
fillOpacity = 0.1) %>%
addPolygons(data = park_4326,
color = "#1a9988",
weight = 1,
fillOpacity = 0.6,
fillColor = "#1a9988") %>%
addPolygons(data = landuse_park_ing,
color = "#1a9988",
weight = 2,
opacity = 0.8,
fill = FALSE) %>%
addPolylines(data = I676_4326,
color = "#eb5600",
opacity = 1,
weight = 2) %>%
addPolygons(data = ChinatownStitch_4326,
color = "#eb5600",
weight = 2,
opacity = 1,
fillColor = "#1a9988",
fillOpacity = 0.8
# fill = alpha("#1a9988", 0.5)
) %>%
fitBounds(
lng1 = bbox$xmin,
lat1 = bbox$ymin,
lng2 = bbox$xmax,
lat2 = bbox$ymax
)
div(class = "center-leaflet", UseCase)
|
|
|
| Callowhill | Chinatown |
To achieve the goal of a data-rich story map, the methodology is structured around a robust model construction process. By incorporating illustrative data visualizations throughout the data analysis, the story map features engaging maps and compelling narratives.
TODO: include description of qualitative analysis
# property <- st_read("DataWrangling/data/opa_properties_public.geojson") %>%
# st_transform('EPSG:2272')
#
# property_studyarea <- st_intersection(property, studyarea)
#
# st_write(property_studyarea %>% st_transform('EPSG:3857'), "Dataset/original_property_studyarea.geojson", driver = "GeoJSON")
property_studyarea <- st_read("Dataset/original_property_studyarea.geojson") %>%
st_transform('EPSG: 2272')
landuse <- st_read("Dataset/landuse_clip/Land_Use_ClipLayer.shp") %>%
st_transform('EPSG:2272')
landuse_rename <- landuse %>%
mutate(landuse =
case_when(
C_DIG2DESC == 11 ~ "Residential Low",
C_DIG2DESC == 12 ~ "Residential Medium",
C_DIG2DESC == 13 ~ "Residential High",
C_DIG2DESC == 21 ~ "Commercial Consumer",
C_DIG2DESC == 22 ~ "Commercial Business/Professional",
C_DIG2DESC == 23 ~ "Commercial Mixed Residential",
C_DIG2DESC == 31 ~ "Industrial",
C_DIG2DESC == 41 ~ "Civic/Institution",
C_DIG2DESC == 51 ~ "Transportation",
C_DIG2DESC == 61 ~ "Culture/Amusement",
C_DIG2DESC == 62 ~ "Active Recreation",
C_DIG2DESC == 71 ~ "Park/Open Space",
C_DIG2DESC == 72 ~ "Cemetery",
C_DIG2DESC == 81 ~ "Water",
C_DIG2DESC == 91 ~ "Vacant",
C_DIG2DESC == 92 ~ "Other/Unknown"
))
# philly properties (from Sijia)
# from the last 5 years, inflation-adjusted prices, total area > 100 sq ft, prices > 30,000, outliers above 5 SD from the mean were removed
# philly_properties <- st_read("data/property_philly_250318.geojson")
nhoods <-
st_read("Dataset/DataWrangle/philadelphia-neighborhoods.geojson") %>%
st_transform('EPSG:2272')
park <- st_read("Dataset/ParksModified/Parks_within_1km.shp") %>%
st_transform('EPSG:2272')
# philly park data (https://opendataphilly.org/datasets/ppr-properties/)
philly_parks <- st_read("https://opendata.arcgis.com/datasets/d52445160ab14380a673e5849203eb64_0.geojson") %>%
st_transform(crs = 'EPSG:2272') %>%
st_intersection(philly_bounds)
metro <-
st_read("https://opendata.arcgis.com/api/v3/datasets/af52d74b872045d0abb4a6bbbb249453_0/downloads/data?format=geojson&spatialRefId=4326") %>%
st_transform('EPSG:2272') %>%
mutate(Type = "metro")
city_hall <- metro[metro$Station == 'City Hall', 6]
trolley <-
st_read("https://opendata.arcgis.com/api/v3/datasets/dd2afb618d804100867dfe0669383159_0/downloads/data?format=geojson&spatialRefId=4326") %>%
st_transform('EPSG:2272')
trolley_renamed <- trolley %>%
rename(Station = StopName,
Route = LineAbbr,
Longitude = Lon,
Latitude = Lat) %>%
mutate(Type = "trolley")
# Combine both datasets into one
transit <- bind_rows(metro, trolley_renamed)
school <-
st_read("https://opendata.arcgis.com/datasets/d46a7e59e2c246c891fbee778759717e_0.geojson") %>%
st_transform('EPSG:2272')
hospital <-
st_read("Dataset/DataWrangle/DOH_Hospitals202311.geojson") %>%
st_transform('EPSG:2272') %>%
st_filter(st_union(Philly_blockgroup))
water <-
st_read("https://services.arcgis.com/fLeGjb7u4uXqeF9q/arcgis/rest/services/Hydrographic_Features_Poly/FeatureServer/1/query?outFields=*&where=1%3D1&f=geojson") %>%
st_transform('EPSG:2272')
bike_network <-
st_read("Dataset/DataWrangle/Bike_Network.geojson") %>%
st_transform('EPSG:2272')
phillyCrimes <-
st_read("Dataset/DataWrangle/phillyCrimes.geojson") %>%
st_transform('EPSG:2272')
# LPSS_PER1000: Number of low-produce supply stores per 1,000 people
# HPSS_PER1000: Number of high-produce supply stores per 1,000 people
retail <-
st_read("https://opendata.arcgis.com/datasets/53b8a1c653a74c92b2de23a5d7bf04a0_0.geojson") %>%
st_transform('EPSG:2272')
calculate_nearest_distance <- function(set_points, other_layer) {
nearest_idx <- st_nearest_feature(set_points, other_layer)
st_distance(set_points, other_layer[nearest_idx, ], by_element = TRUE) %>% as.numeric()
}
Luming_property <- property_studyarea %>%
dplyr::select(objectid, geometry, sale_price) %>%
rename(orig_objectid = objectid)
Luming_property <- Luming_property %>%
mutate(distance_to_city_hall = st_distance(., city_hall) %>% as.numeric()) %>%
mutate(
distance_to_nearest_transit = calculate_nearest_distance(geometry, transit),
distance_to_nearest_hospital = calculate_nearest_distance(geometry, hospital),
distance_to_nearest_school = calculate_nearest_distance(geometry, school),
distance_to_nearest_park = calculate_nearest_distance(geometry, park),
distance_to_nearest_water = calculate_nearest_distance(geometry, water),
distance_to_nearest_bikelane = calculate_nearest_distance(geometry, bike_network),
distance_to_I676 = calculate_nearest_distance(geometry, I_676)
) %>%
st_join(nhoods["NAME"], left = TRUE) %>%
rename(nhoods_name = NAME) %>%
st_join(landuse_rename["landuse"], left = TRUE) %>%
st_join(retail[, c("LPSS_PER1000", "HPSS_PER1000")], left = TRUE)
Luming_property <- Luming_property %>%
mutate(
crime_nn1 = nn_function(st_coordinates(Luming_property),
st_coordinates(phillyCrimes), k = 1),
crime_nn2 = nn_function(st_coordinates(Luming_property),
st_coordinates(phillyCrimes), k = 2),
crime_nn3 = nn_function(st_coordinates(Luming_property),
st_coordinates(phillyCrimes), k = 3),
crime_nn4 = nn_function(st_coordinates(Luming_property),
st_coordinates(phillyCrimes), k = 4),
crime_nn5 = nn_function(st_coordinates(Luming_property),
st_coordinates(phillyCrimes), k = 5))
property_EDA <- st_read("Dataset/property_sijia_eda.geojson") %>%
st_transform('EPSG:2272')
property_EDA_update <- property_EDA %>%
dplyr::select(-sale_price.y) %>%
rename(sale_price = sale_price.x) %>%
mutate(log_sale_price = ifelse(!is.na(sale_price) & sale_price == 0,
log(sale_price + 1),
log(sale_price)))
property_EDA_update <-
rbind(
property_EDA_update %>% st_intersection(studyarea_north["geometry"]) %>%
mutate(I676 = "north"),
property_EDA_update %>% st_intersection(studyarea_south["geometry"]) %>%
mutate(I676 = "south")
)
# making sure philly_properties has the same columns as property_EDA_update
# this dataset will be used for the Study Area Scale analysis (I-676 effects + Discontinuity effects)
# also, only keep properties within the study area
names_toadd <- names(property_EDA_update)[which(!(names(property_EDA_update) %in% names(philly_properties)))]
properties_i676 <- philly_properties %>%
filter(objectid %in% property_studyarea$objectid) %>% # this limits what was formally 2394 properties in the study area, to only 94 properties..... definitely needs to be checked
mutate(objectid = as.character(objectid)) %>%
left_join(property_EDA_update %>%
select(c(names_toadd, objectid)) %>%
st_drop_geometry(),
by = "objectid")
# checking that this worked
# all(names(property_EDA_update) %in% names(properties_i676)) # TRUE, period!
Before jumping into modeling, we first explored the relationship between distance to highway and property prices. We used property data from OPA Philadelphia that was cleaned to include only the data from the last five years (Jan. 1st, 2020 to Dec. 31st, 2024) with prices adjusted for inflation after removing extreme outliers and improbable values, and keeping properties with total area of over 100 square feet. Looking at properties (of all kinds) within 500 m of Phildelphia’s main highways – I-95 on the East, I-76 on the West, US-1 in the North, and I-676 through the city – we saw the following trends in property prices.
buff500 <- 500 * 3.28084
hiway_500buff <- st_union(st_buffer(philly_highways, buff500))
# adding distance to highway and log price variables
# properties500 <- philly_properties[hiway_500buff,] %>%
# mutate(price_perTLA = adjusted_sale_price / total_livable_area,
# price_perTA = adjusted_sale_price / total_area,
# log_price = log(adjusted_sale_price),
# log_price_perTLA = log(price_perTLA),
# log_price_perTA = log(price_perTA),
# dist_to_US001 = calculate_nearest_distance(geometry,
# philly_highways %>%
# filter(ST_RT_NO == "0001")),
# dist_to_I076 = calculate_nearest_distance(geometry,
# philly_highways %>%
# filter(ST_RT_NO == "0076")),
# dist_to_I095 = calculate_nearest_distance(geometry,
# philly_highways %>%
# filter(ST_RT_NO == "0095")),
# dist_to_I676 = calculate_nearest_distance(geometry,
# philly_highways %>%
# filter(ST_RT_NO == "0676")),
# dist_to_US001m = dist_to_US001 * 0.3048,
# dist_to_I076m = dist_to_I076 * 0.3048,
# dist_to_I095m = dist_to_I095 * 0.3048,
# dist_to_I676m = dist_to_I676 * 0.3048)
#
# prop500_closest <- properties500 %>%
# select(matches("objectid|dist_to")) %>%
# pivot_longer(cols = 2:5,
# names_to = "highway",
# values_to = "distance") %>%
# group_by(objectid) %>%
# summarise(dist_to_closest_highway = min(distance, na.rm = T)) %>%
# ungroup() %>%
# mutate(dist_to_closest_highwaym = dist_to_closest_highway * 0.3048)
#
# # join closest highway
# prop500 <- left_join(properties500,
# prop500_closest %>% st_drop_geometry(),
# by = "objectid") %>%
# mutate(closest_highway = case_when(
# dist_to_closest_highway == dist_to_US001 ~ "US-1",
# dist_to_closest_highway == dist_to_I076 ~ "I-76",
# dist_to_closest_highway == dist_to_I095 ~ "I-95",
# dist_to_closest_highway == dist_to_I676 ~ "I-676",
# .default = NA),
# closest_highway = factor(closest_highway, levels = c("US-1", "I-76", "I-676", "I-95")),
# highway_type = case_when(closest_highway %in% c("US-1","I-676") ~ "through neighborhood",
# closest_highway %in% c("I-95","I-76") ~ "along waterway",
# .default = NA))
# save output and read in the prefiltered to save time
# st_write(prop500, "data/philly_prop500_dst2hghwy_250325.geojson")
# read in pre-saved file
prop500 <- st_read("Dataset/PhillyPropertiesNearHighways/philly_prop500_dst2hghwy_250325.geojson") %>%
mutate(closest_highway = factor(closest_highway, levels = c("US-1", "I-76", "I-676", "I-95")))
# map of properties within 500m of highway cored by which highway they're closest to
ggplot() +
geom_sf(data = philly_bounds, fill = "#c9b887", color = "grey") +
geom_sf(data = hydro, fill = "#7d97c7", color = "transparent") +
geom_sf(data = philly_parks, fill = "#6b9169", color = "transparent") +
geom_sf(data = hiway_500buff,
color = "transparent", fill = "grey", alpha = 0.4) +
geom_sf(data = prop500,
aes(color = closest_highway), size = 0.3,
show.legend = F) +
geom_sf(data = philly_highways,
color = "black") +
# add custom colors
scale_color_manual(values = c("US-1" = "#1a9988",
"I-76" = "#eb5600",
"I-95" = "#eba075",
"I-676" = "#7fe3d6")) +
# labels for highway names
geom_text(aes(label = "US-1",
x = 2700000,
y = 256453.2),
color = "#1a9988") +
geom_text(aes(label = "I-76",
x = 2680000,
y = 235000),
color = "#eb5600") +
geom_text(aes(label = "I-95",
x = 2705200,
y = 235000),
color = "#eba075") +
geom_text(aes(label = "I-676",
x = 2694000,
y = 243000),
color = "#7fe3d6") +
labs(title = "Properties within 500m of Highways in Philly",
subtitle = "US-1, I-76, I-95, and I-676",
color = "Closest Highway") +
theme_void()
prop500_faceted <-
ggplot(data = prop500,
aes(x = log_price_perTA, color = closest_highway, fill = closest_highway)) +
geom_histogram(bins = 50, show.legend = F) +
facet_wrap(~closest_highway, scales = "free_y") +
scale_color_manual(values = c("US-1" = "#1a9988",
"I-76" = "#eb5600",
"I-95" = "#eba075",
"I-676" = "#7fe3d6")) +
scale_fill_manual(values = c("US-1" = "#1a9988",
"I-76" = "#eb5600",
"I-95" = "#eba075",
"I-676" = "#7fe3d6")) +
labs(title = "Distribution of Property Sale Price",
subtitle = "Split by Closest Highway",
x = "Log Sale Price (per Total Area)",
y = "Count") +
theme_minimal()
prop500_faceted
# median sale prices to talk about in the text
med_price_us1 <- round(median(prop500 %>%
filter(closest_highway == "US-1") %>%
pull(adjusted_sale_price)))
med_price_i76 <- round(median(prop500 %>%
filter(closest_highway == "I-76") %>%
pull(adjusted_sale_price)))
med_price_i95 <- round(median(prop500 %>%
filter(closest_highway == "I-95") %>%
pull(adjusted_sale_price)))
med_price_676 <- round(median(prop500 %>%
filter(closest_highway == "I-676") %>%
pull(adjusted_sale_price)))
We see that the distribution of log sale prices of properties within 500m of highways in Philadelphia generally follow a normal distribution. Though there aren’t as many properties near I-676 as other there are near the other highways, properties are most expensive here with a median property price of $972,362. Properties near I-95 and I-76 are among a similar range of prices with median property prices being $354,750 and $315,203, respectively, though there are almost five times as many properties near I-95 compared to I-76. Finally, there are many properties within 500m of US-1, but the median property price was lowest compared to other highways at $234,847.
# simple scatter of price (as log price per total area in sq ft) and dist2closesthighway
cor_us1 <- round(cor(prop500 %>%
filter(grepl("US", closest_highway)) %>%
pull(log_price_perTA),
prop500 %>%
filter(grepl("US", closest_highway)) %>%
pull(dist_to_closest_highwaym)), 3)
cor_i76 <- round(cor(prop500 %>%
filter(grepl("I-76", closest_highway)) %>%
pull(log_price_perTA),
prop500 %>%
filter(grepl("I-76", closest_highway)) %>%
pull(dist_to_closest_highwaym)), 3)
cor_i95 <- round(cor(prop500 %>%
filter(grepl("I-95", closest_highway)) %>%
pull(log_price_perTA),
prop500 %>%
filter(grepl("I-95", closest_highway)) %>%
pull(dist_to_closest_highwaym)), 3)
cor_i676 <- round(cor(prop500 %>%
filter(grepl("I-676", closest_highway)) %>%
pull(log_price_perTA),
prop500 %>%
filter(grepl("I-676", closest_highway)) %>%
pull(dist_to_closest_highwaym)), 3)
scatter_4highways <-
ggplot(data = prop500,
aes(x = dist_to_closest_highwaym, y = log_price_perTA,
color = closest_highway, fill = closest_highway)) +
geom_point(alpha = .2, size = 0.3, show.legend = F) +
geom_smooth(method = "lm", size = 1.2, show.legend = F) +
scale_color_manual(values = c("US-1" = "#1a9988",
"I-76" = "#eb5600",
"I-95" = "#eba075",
"I-676" = "#7fe3d6")) +
scale_fill_manual(values = c("US-1" = "#1a9988",
"I-76" = "#eb5600",
"I-95" = "#eba075",
"I-676" = "#7fe3d6")) +
geom_text(aes(label = paste("US-1:", cor_us1),
x = 150,
y = 1.25),
color = "#1a9988",
show.legend = F) +
geom_text(aes(label = paste("I-76:", cor_i76),
x = 350,
y = 1.25),
color = "#eb5600",
show.legend = F) +
geom_text(aes(label = paste("I-95:", cor_i95),
x = 350,
y = 8.75),
color = "#eba075",
show.legend = F) +
geom_text(aes(label = paste("I-676:", cor_i676),
x = 150,
y = 8.75),
color = "#7fe3d6",
show.legend = F) +
labs(title = "Property Price (log price per TA) as a function of Distance to Closest Highway",
subtitle = "Properties within 500m of highways",
x = "Distance to Closest Highway (m)",
y = "Property Sale Price (log price per TA)") +
theme_minimal()
scatter_4highways
From this scatter plot, we see that properties within 500m of I-676 and I-76 follow the expected pattern of increasing property prices as one gets further from the highway (folks want to avoid the noise and air pollution from the highway). The correlation between distance to closest highway (in meters) and Property sale price (log-transformed price per total area) is 0.147 and 0.107 for properties near I-676 and I-76, respectively, meaning for every additional meter away from the highway, property price per square foot of total area increases by 15.8% and 11.3%, respectively.
We see a near null effect for property prices and distance to US-1 (correlation of 0.023), but we see a negative relationship between distance to I-95 and property price (correlation of -0.134). This means that for every additional meter away from the highway, property price per square foot of total area decreases by 12.5%.
park_clipped <- st_intersection(park, studyarea)
park_valid <- st_make_valid(park)
transit_valid <- st_make_valid(transit)
city_hall_valid <- st_make_valid(city_hall)
school_valid <- st_make_valid(school)
water_valid <- st_make_valid(water)
bike_network_valid <- st_make_valid(bike_network)
studyarea_buffer <- st_buffer(studyarea, dist = 1000)
# Clip each dataset to the study area
recreation_clipped <- st_intersection(park, studyarea_buffer)
transit_clipped <- st_intersection(transit_valid, studyarea_buffer)
city_hall_clipped <- st_intersection(city_hall_valid, studyarea_buffer)
school_clipped <- st_intersection(school_valid, studyarea_buffer)
water_clipped <- st_intersection(water_valid, studyarea_buffer)
bike_network_clipped <- st_intersection(bike_network_valid, studyarea_buffer)
landuse_transportation <- landuse_rename[landuse_rename$landuse == "Transportation", ]
property_vacant <- property_EDA %>%
filter(category_code_description %in% c("VACANT LAND","SINGLE FAMILY", "VACANT LAND - NON-RESIDENTIAL")) %>%
mutate(category_code_description = case_when(
category_code_description == "VACANT LAND - NON-RESIDENTIAL" ~ "VACANT LAND",
TRUE ~ category_code_description
))
landuse_colors <- c(
"Residential Low" = "#FFFFB3",
"Residential Medium" = "#FFEB8B",
"Residential High" = "#FFCC80",
"Commercial Consumer" = "#FFB3B3",
"Commercial Business/Professional" = "#FF9980",
"Commercial Mixed Residential" = "#FF6666",
"Industrial" = "#D6A3FF",
"Civic/Institution" = "#66B3FF",
"Transportation" = "gray95",
"Culture/Amusement" = "#C1FFC1",
"Active Recreation" = "#66FFD9",
"Park/Open Space" = "#99FF66",
"Cemetery" = "#FF9999",
"Water" = "#99CCFF",
"Vacant" = "#D3D3D3",
"Other/Unknown" = "gray25"
)
BasicGeo_plot1 <- ggplot(data = landuse_rename %>%
filter(!is.na(landuse) & landuse != "Park/Open Space")) +
geom_sf(aes(fill = landuse), color = NA) +
geom_sf(data = park_clipped, aes(fill = "Park"), color = NA, alpha = 1) + # Clipped park layer
scale_fill_manual(values = c(landuse_colors, "Park" = "#99FF66"), na.value = "white") +
labs(title = "Land Use",
subtitle = "Properties functions in Chinatown",
fill = "Category") +
theme_void() +
theme(legend.position = "right",
plot.title = element_text(size = 16, face = "bold"),
plot.subtitle = element_text(size = 10),
legend.key.size = unit(0.5, "cm"),
legend.text = element_text(size = 8))
BasicGeo_plot2 <- ggplot() +
geom_sf(data = landuse, fill = "white", color = "gray90", size = 0.05) +
geom_sf(data = recreation_clipped, aes(fill = "Park"), color = NA, alpha = 1) +
geom_sf(data = landuse_transportation, aes(fill = "Transportation"), color = NA, alpha = 0.5) +
geom_sf(data = transit_clipped, aes(color = "Transit"), size = 2) +
geom_sf(data = school_clipped, aes(color = "School"), size = 2) +
geom_sf(data = bike_network_clipped, aes(color = "Bike Network"), size = 1) +
scale_color_manual(values = c("Transit" = "blue", "School" = "orange", "Bike Network" = "purple"), name = NULL) +
scale_fill_manual(values = c("Park" = "green","Transportation" = "gray70"), name = NULL) +
labs(
title = "Amenities",
subtitle = "<span style='color:orange;'>Schools</span>, <span style='color:blue;'>Transit Stations</span>, <span style='color:green;'>Parks</span>, and <span style='color:purple;'>Bike Lanes</span>"
) +
theme_void() +
theme(
legend.position = "none",
plot.title = element_text(size = 16, face = "bold"),
plot.subtitle = element_markdown(size = 10, hjust = 0),
legend.key.size = unit(0.5, "cm"),
legend.text = element_text(size = 8)
)
BasicGeo_plot3 <- ggplot() +
geom_sf(data = landuse, fill = "white", color = "gray85", size = 0.05) +
geom_sf(data = studyarea, fill = "transparent", color = "grey", linetype = "dashed", linewidth = 2) +
geom_sf(data = discontinuity, fill = "#eb5600", color = "transparent") +
geom_sf(data = Chinatown_Stitch, fill = "#1a9988", alpha = 0.8) +
geom_sf(data = property_vacant, aes(color = category_code_description), size = 1) +
scale_colour_manual(values = c("black", "#B67352")) +
labs(title="Potential Lands",
subtitle = glue("<span style='color:#1a1a1a;'>Single Family & </span><span style='color:#B67352;'>Vacant Land</span>"),
# caption = "Figure "
) +
theme_void() +
theme(legend.position = "none",
plot.title = element_text(size = 16, face = "bold"),
plot.subtitle = ggtext::element_markdown(size = 10))
BasicGeo_plot1
BasicGeo_plot2 | BasicGeo_plot3
|
|
|
One of the key deliverables that OTIS wants from us includes an analysis on the effect of I-676 on property prices nearby. Here, we construct a relatively simple linear model with a few key independent variables to look at the effect of I-676 on property prices while accounting for some control variables such as the square footage of the property, number of bathrooms, year built, zoning, and zip code among others.
# keep properties within 500m of I-676
i676_buff <- st_union(st_buffer(philly_highways %>%
filter(ST_RT_NO == "0676"),
buff500))
# actually, working with the study_area properties dataset defined earlier
# add features for simple linear regression
propi676_500 <- prop500[i676_buff,] %>%
# some feature engineering
mutate(residential = ifelse(category_code_description %in% c("APARTMENTS > 4 UNITS",
"MIXED USE",
"MULTI FAMILY",
"SINGLE FAMILY",
"GARAGE - RESIDENTIAL"),
1, 0), # is this property residential? (including mixed)
year_built = as.numeric(year_built),
year_built_group = case_when(
is.na(year_built) | year_built < 1990 ~ "Pre-90s",
year_built >= 1990 & year_built < 2000 ~ "90s",
year_built >= 2000 & year_built < 2010 ~ "00s",
year_built >= 2010 & year_built < 2020 ~ "10s",
year_built >= 2020 ~ "20s"
),
log_dist_i676 = log(dist_to_I676),
central_air = case_when(
central_air %in% c("1", "Y") ~ "Y",
TRUE ~ "N"
),
central_air = factor(central_air),
exterior_condition = case_when(
exterior_condition %in% c("1","2", "3", "4") ~ "AboveAverage",
exterior_condition == "5" ~ "Average",
exterior_condition %in% c("6","8","9") ~ "BelowAverage",
.default = "NAorVacant"
),
exterior_condition = factor(exterior_condition),
interior_condition = case_when(
interior_condition %in% c("1","2","3") ~ "AboveAverage",
interior_condition == "4" ~ "Average",
interior_condition %in% c("5","7") ~ "BelowAverage",
.default = "NAorVacant"
),
interior_condition = factor(interior_condition),
garage_spaces = case_when(
is.na(garage_spaces) | garage_spaces == 0 ~ "0",
garage_spaces == 1 ~ "1",
.default = "2orMore"
),
garage_spaces = factor(garage_spaces),
number_of_bathrooms = case_when(
number_of_bathrooms == 1 ~ "1",
number_of_bathrooms == 2 ~ "2",
number_of_bathrooms > 2 ~ "3orMore",
.default = "0" # NA or 0
),
number_of_bathrooms = factor(number_of_bathrooms),
number_of_bedrooms = case_when(
number_of_bedrooms >= 3 ~ "3orMore",
is.na(number_of_bedrooms) ~ "0",
.default = as.character(number_of_bedrooms)
),
number_of_bedrooms = factor(number_of_bedrooms),
quality_grade_recoded = case_when(
quality_grade %in% c("X", "X-", "X ") ~ "HighestQuality",
quality_grade %in% c("A+", "A", "A ", "A-", "1", "2") ~ "HigherQuality",
quality_grade %in% c("B+", "B", "B ", "B-", "3", "4") ~ "AverageQuality",
quality_grade %in% c("S+", "S", "C+", "5", "6") ~ "LowerQuality",
.default = "LowestQuality" # all lower grades + missing data
),
quality_grade_recoded = factor(quality_grade_recoded),
separate_utilities = factor(separate_utilities),
building_code_description_new = case_when(
grepl("GYM", building_code_description_new) ~ "GYM",
grepl("^WARE", building_code_description_new) ~ "WAREHOUSE",
grepl("COLLEGE|SCHOOL", building_code_description_new) ~ "SCHOOL/COLLEGE/UNI",
grepl("ROW MIXED", building_code_description_new) ~ "ROWHOME MIXED",
grepl("TWIN MIXED", building_code_description_new) ~ "TWIN MIXED",
grepl("^ROW|^TWIN|CONDO|AS RESID", building_code_description_new) ~ "ROW/TWIN/CONDO", # apt blt as resid
grepl("BOARDING|SNGL", building_code_description_new) ~ "BOARDING/ROOMING/SNGL OCC",
grepl("MEDICAL", building_code_description_new) ~ "MEDICAL OFFICE",
grepl("FRAT", building_code_description_new) ~ "BOARDING/ROOMING/SNGL OCC",
grepl("LOW RISE", building_code_description_new, ignore.case = T) ~ "LOW RISE BLDG",
grepl("MID RISE", building_code_description_new) ~ "MID RISE APT",
grepl("HIGH RISE", building_code_description_new) ~ "HIGH RISE APT",
grepl("PARKING", building_code_description_new) ~ "PARKING",
grepl("BAR/", building_code_description_new) ~ building_code_description_new, # unchanged names
grepl("LEGIT", building_code_description_new) ~ building_code_description_new, # unchanged names
grepl("DIALYSI", building_code_description_new) ~ building_code_description_new, # unchanged names
grepl("OFFICE WA", building_code_description_new) ~ building_code_description_new, # unchanged names
grepl("CONV FOOD", building_code_description_new) ~ building_code_description_new, # unchanged names
grepl("MANUFACTU", building_code_description_new) ~ building_code_description_new, # unchanged names
.default = "OTHER" # including missing
),
building_code_description_new = factor(building_code_description_new),
zoning_group = case_when(
zoning %in% c("CMX2.5", "CMX2", "CMX1") ~ "Nghbrhd Commercial Mixed",
zoning %in% c("CMX3", "CMX4") ~ "Cmmnty Commercial Mixed",
zoning == "CMX5" ~ "CC Commercial Mixed",
zoning == "RM1" ~ "Resid MultiFam-Big",
zoning == "RM4" ~ "Resid MultiFam-Small",
zoning == "RMX3" ~ "Resid Mixed",
grepl("RSA", zoning) ~ "Resid SnglFam Attached",
zoning == "RTA1" ~ "Resid 2Fam Attached",
zoning == "IRMX" ~ "Indstrl Resid Mixed",
.default = "Other" # including missing
)
) %>%
select( # only keep vars necessary for model
log_price,
dist_to_I676m, log_dist_i676,
residential,
year_built,
central_air, garage_spaces, separate_utilities,
number_of_bathrooms, number_of_bedrooms, total_area,
exterior_condition, interior_condition, quality_grade_recoded,
building_code_description_new, zoning_group,
zip_code
)
# need to check which of residential, building_code_description_new, or zoning is the best in this model
# also need to check if turning some of the numeric variables into categorical is better (year_built may benefit from this)
# map to visualize
# ggplot() +
# geom_sf(data = philly_bounds, fill = "#c9b887", color = "grey") +
# geom_sf(data = hydro, fill = "#7d97c7", color = "transparent") +
# geom_sf(data = philly_parks, fill = "#6b9169", color = "transparent") +
# geom_sf(data = i676_buff,
# color = "transparent", fill = "grey", alpha = 0.4) +
# geom_sf(data = propi676_500, size = 0.3, color = "#7fe3d6",
# show.legend = F) +
# geom_sf(data = philly_highways %>%
# filter(ST_RT_NO == "0676"),
# color = "black") +
# labs(title = "Properties within 500m of I-676") +
# theme_void()
Doing the same thing but only with the study area properties (with valid data after cleaning) –only leaves 94 properties.
# actually, working with the study_area properties dataset defined earlier
# add features for simple linear regression
properties_i676_tomodel <- properties_i676 %>%
# some feature engineering
mutate(log_price = log(adjusted_sale_price),
dist_to_I676m = distance_to_I676 * 0.3048, # convert to meters
log_dist_i676 = log(dist_to_I676m),
residential = ifelse(category_code_description %in% c("APARTMENTS > 4 UNITS",
"MIXED USE",
"MULTI FAMILY",
"SINGLE FAMILY",
"GARAGE - RESIDENTIAL"),
1, 0), # is this property residential? (including mixed)
year_built = as.numeric(year_built),
year_built_group = case_when(
is.na(year_built) | year_built < 1990 ~ "Pre-90s",
year_built >= 1990 & year_built < 2000 ~ "90s",
year_built >= 2000 & year_built < 2010 ~ "00s",
year_built >= 2010 & year_built < 2020 ~ "10s",
year_built >= 2020 ~ "20s"
),
central_air = case_when(
central_air %in% c("1", "Y") ~ "Y",
TRUE ~ "N"
),
central_air = factor(central_air),
exterior_condition = case_when(
exterior_condition %in% c("1","2", "3", "4") ~ "AboveAverage",
exterior_condition == "5" ~ "Average",
exterior_condition %in% c("6","8","9") ~ "BelowAverage",
.default = "NAorVacant"
),
exterior_condition = factor(exterior_condition),
interior_condition = case_when(
interior_condition %in% c("1","2","3") ~ "AboveAverage",
interior_condition == "4" ~ "Average",
interior_condition %in% c("5","7") ~ "BelowAverage",
.default = "NAorVacant"
),
interior_condition = factor(interior_condition),
garage_spaces = case_when(
is.na(garage_spaces) | garage_spaces == 0 ~ "0",
garage_spaces == 1 ~ "1",
.default = "2orMore"
),
garage_spaces = factor(garage_spaces),
number_of_bathrooms = case_when(
number_of_bathrooms == 1 ~ "1",
number_of_bathrooms == 2 ~ "2",
number_of_bathrooms > 2 ~ "3orMore",
.default = "0" # NA or 0
),
number_of_bathrooms = factor(number_of_bathrooms),
number_of_bedrooms = case_when(
number_of_bedrooms >= 3 ~ "3orMore",
is.na(number_of_bedrooms) ~ "0",
.default = as.character(number_of_bedrooms)
),
number_of_bedrooms = factor(number_of_bedrooms),
quality_grade_recoded = case_when(
quality_grade %in% c("X", "X-", "X ") ~ "HighestQuality",
quality_grade %in% c("A+", "A", "A ", "A-", "1", "2") ~ "HigherQuality",
quality_grade %in% c("B+", "B", "B ", "B-", "3", "4") ~ "AverageQuality",
quality_grade %in% c("S+", "S", "C+", "5", "6") ~ "LowerQuality",
.default = "LowestQuality" # all lower grades + missing data
),
quality_grade_recoded = factor(quality_grade_recoded),
separate_utilities = factor(separate_utilities),
building_code_description_new = case_when(
grepl("GYM", building_code_description_new) ~ "GYM",
grepl("^WARE", building_code_description_new) ~ "WAREHOUSE",
grepl("COLLEGE|SCHOOL", building_code_description_new) ~ "SCHOOL/COLLEGE/UNI",
grepl("ROW MIXED", building_code_description_new) ~ "ROWHOME MIXED",
grepl("TWIN MIXED", building_code_description_new) ~ "TWIN MIXED",
grepl("^ROW|^TWIN|CONDO|AS RESID", building_code_description_new) ~ "ROW/TWIN/CONDO", # apt blt as resid
grepl("BOARDING|SNGL", building_code_description_new) ~ "BOARDING/ROOMING/SNGL OCC",
grepl("MEDICAL", building_code_description_new) ~ "MEDICAL OFFICE",
grepl("FRAT", building_code_description_new) ~ "BOARDING/ROOMING/SNGL OCC",
grepl("LOW RISE", building_code_description_new, ignore.case = T) ~ "LOW RISE BLDG",
grepl("MID RISE", building_code_description_new) ~ "MID RISE APT",
grepl("HIGH RISE", building_code_description_new) ~ "HIGH RISE APT",
grepl("PARKING", building_code_description_new) ~ "PARKING",
grepl("BAR/", building_code_description_new) ~ building_code_description_new, # unchanged names
grepl("LEGIT", building_code_description_new) ~ building_code_description_new, # unchanged names
grepl("DIALYSI", building_code_description_new) ~ building_code_description_new, # unchanged names
grepl("OFFICE WA", building_code_description_new) ~ building_code_description_new, # unchanged names
grepl("CONV FOOD", building_code_description_new) ~ building_code_description_new, # unchanged names
grepl("MANUFACTU", building_code_description_new) ~ building_code_description_new, # unchanged names
.default = "OTHER" # including missing
),
building_code_description_new = factor(building_code_description_new),
zoning_group = case_when(
zoning %in% c("CMX2.5", "CMX2", "CMX1") ~ "Nghbrhd Commercial Mixed",
zoning %in% c("CMX3", "CMX4") ~ "Cmmnty Commercial Mixed",
zoning == "CMX5" ~ "CC Commercial Mixed",
zoning == "RM1" ~ "Resid MultiFam-Big",
zoning == "RM4" ~ "Resid MultiFam-Small",
zoning == "RMX3" ~ "Resid Mixed",
grepl("RSA", zoning) ~ "Resid SnglFam Attached",
zoning == "RTA1" ~ "Resid 2Fam Attached",
zoning == "IRMX" ~ "Indstrl Resid Mixed",
.default = "Other" # including missing
)
) %>%
select( # only keep vars necessary for model
log_price,
dist_to_I676m, log_dist_i676,
residential,
year_built,
central_air, garage_spaces, separate_utilities,
number_of_bathrooms, number_of_bedrooms, total_area,
exterior_condition, interior_condition, quality_grade_recoded,
building_code_description_new, zoning_group,
zip_code
)
Doing the same thing but only with the newest study area properties (with valid data after cleaning, focusing on the study area, not the cleaned data of all of Philly properties) –only leaves 94 properties.
# working with the newest philly_properties dataset where data was cleaned for our studyarea
property_studyarea_new <- st_read("Dataset/property_final.geojson")
## Reading layer `property_final' from data source
## `/Users/akiradisandro/Documents/MUSA/Spring25/MUSA8010_Practicum/Chinatown-Practicum/Dataset/property_final.geojson'
## using driver `GeoJSON'
## Simple feature collection with 1637 features and 105 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: 2692396 ymin: 236562 xmax: 2697661 ymax: 240010.2
## Projected CRS: NAD83 / Pennsylvania South (ftUS)
# add features for simple linear regression
props_i676_tomodel <- property_studyarea_new %>%
# filter() %>% # there are a couple properties with total area < 100; reduces dataset to 990 properties
# some feature engineering
mutate(log_price = log(adj_sale_price),
dist_to_I676m = distance_to_I676 * 0.3048, # convert to meters
log_dist_i676 = log(dist_to_I676m),
residential = ifelse(category_code_description %in% c("APARTMENTS > 4 UNITS",
"MIXED USE",
"MULTI FAMILY",
"SINGLE FAMILY",
"GARAGE - RESIDENTIAL"),
1, 0), # is this property residential? (including mixed)
# all the other variables have been cleaned and feature engineered
) %>%
select( # only keep vars necessary for model
log_price,
dist_to_I676m, log_dist_i676,
residential,
year_built,
central_air, garage_spaces, separate_utilities,
number_of_bathrooms, number_of_bedrooms, total_area,
exterior_condition, interior_condition, quality_grade,
building_code_description_new, zoning,
zip_code,
I676_NS
)
# Aki 4/8/25 --- code is breaking because the property_studyarea_new used to read in data from property_CT.geojson, but the file structure is different from the new property_final.geojson
# Numeric variables
# number_of_rooms was previously included but all but 10 values are NA
# propi676_500_numvars <- propi676_500 %>%
# select(log_price, dist_to_I676m,
# # depth, fireplaces, frontage, garage_spaces, number_stories, # ended up excluding
# number_of_bathrooms,
# number_of_bedrooms, total_area, year_built) %>%
# mutate(year_built = as.numeric(year_built)) %>%
# st_drop_geometry()
#
# cors_logprice <- cor(propi676_500_numvars,
# use = "pairwise.complete.obs")
#
# # significance test
# # sigtest_logprice <- cor.mtest(propi676_500_numvars)
#
# # pretty correlation matrix of all continuous variables
# corrplot(cors_logprice,
# title = "Correlation Matrix for Log Price",
# # p.mat = sigtest_logprice$p,
# method = 'circle',
# type = 'lower',
# insig='blank',
# addCoef.col ='black',
# number.cex = 0.8,
# diag=FALSE)
# Based off of this correlation matrix, I will move forward with number_of_bathrooms (instead of number_of_bedrooms) since it has a higher correlation with log_price
# total_area will be included (and as a result, I won't keep frontage, depth, and number_of_stories because of multicollinearity)
# Categorical variables
## Basements
# not including because there isn't enough of a difference in log price across categories
# unique(propi676_500$basements)
# table(propi676_500$basements)
# summary(propi676_500$basements)
#
# temp <- propi676_500 %>%
# mutate(basement_finish = case_when(basements %in% c("A", "B", "E", "F") ~ "Finished", # including finished and semi-finished
# basements %in% c("C", "G", "J") ~ "Unfinished",
# basements == "0" ~ "No Basement",
# .default = "Unknown Finish"),
# basement_size = case_when(basements %in% c("A", "B", "C", "D") ~ "Full", # including finished and semi-finished
# basements %in% c("E", "F", "G", "H") ~ "Partial",
# basements == "0" ~ "No Basement",
# .default = "Unknown Size"))
#
# ggplot(data = temp %>%
# group_by(basement_finish) %>%
# summarize(mean_price = mean(adjusted_sale_price),
# mean_logprice = mean(log_price)) %>%
# ungroup() %>%
# pivot_longer(cols = 2:3, names_to = "price_type", values_to = "mean_price"),
# aes(x = basement_finish, y = mean_price)) +
# geom_bar(stat = "identity", fill = "#1a9988", alpha = 0.8, width = 0.3) +
# facet_wrap(~price_type, scales = "free") +
# theme_minimal() +
# labs(title = "Mean Property Price (raw and log-transformed) for Different Basement Finishes",
# x = "Basement Finishes",
# y = "Mean Price")
#
# ggplot(data = temp %>%
# group_by(basement_size) %>%
# summarize(mean_price = mean(adjusted_sale_price),
# mean_logprice = mean(log_price)) %>%
# ungroup() %>%
# pivot_longer(cols = 2:3, names_to = "price_type", values_to = "mean_price"),
# aes(x = basement_size, y = mean_price)) +
# geom_bar(stat = "identity", fill = "#1a9988", alpha = 0.8, width = 0.3) +
# facet_wrap(~price_type, scales = "free") +
# theme_minimal() +
# labs(title = "Mean Property Price (raw and log-transformed) for Different Basement Size",
# x = "Basement Size",
# y = "Mean Price")
## Category Code
# only including whether a property is residential or not
# unique(propi676_500$category_code)
# table(propi676_500$category_code)
# summary(propi676_500$category_code)
#
# temp <- propi676_500 %>%
# mutate(category_simple = case_when(category_code_description %in% c("APARTMENTS > 4 UNITS",
# "MIXED USE",
# "MULTI FAMILY",
# "SINGLE FAMILY") ~ "Residential or Mixed",
# grepl("INDUS", category_code_description) ~ "Industrial",
# grepl("HOTEL", category_code_description) ~ "Hotel",
# grepl("OFFICES", category_code_description) ~ "Offices",
# category_code_description %in% c("COMMERCIAL",
# "GARAGE - COMMERCIAL",
# "RETAIL") ~ "Commercial",
# grepl("VACANT", category_code_description) ~ "Vacant",
# .default = "Other"),
# residential = ifelse(category_simple == "Residential or Mixed", 1, 0))
# ggplot(data = temp %>%
# group_by(category_simple) %>%
# summarize(mean_price = mean(adjusted_sale_price),
# mean_logprice = mean(log_price)) %>%
# ungroup() %>%
# pivot_longer(cols = 2:3, names_to = "price_type", values_to = "mean_price"),
# aes(x = category_simple, y = mean_price)) +
# geom_bar(stat = "identity", fill = "#1a9988", alpha = 0.8, width = 0.3) +
# facet_wrap(~price_type, scales = "free") +
# theme_minimal() +
# labs(title = "Mean Property Price (raw and log-transformed) by Category",
# x = "Property Category",
# y = "Mean Price")
# ggplot(data = temp %>%
# group_by(residential) %>%
# summarize(mean_price = mean(adjusted_sale_price),
# mean_logprice = mean(log_price)) %>%
# ungroup() %>%
# pivot_longer(cols = 2:3, names_to = "price_type", values_to = "mean_price"),
# aes(x = residential, y = mean_price)) +
# geom_bar(stat = "identity", fill = "#1a9988", alpha = 0.8, width = 0.3) +
# facet_wrap(~price_type, scales = "free") +
# theme_minimal() +
# labs(title = "Mean Property Price (raw and log-transformed) by Category (Residential or not)",
# x = "Property Category",
# y = "Mean Price")
## Central Air
# not including because there isn't enough of a difference in log price across categories
# unique(propi676_500$central_air)
# table(propi676_500$central_air)
# summary(propi676_500$central_air)
#
# temp <- propi676_500 %>%
# mutate(central_air_new = case_when(central_air == "Y" ~ central_air,
# .default = "N or unknown"))
#
# ggplot(data = temp %>%
# group_by(central_air_new) %>%
# summarize(mean_price = mean(adjusted_sale_price),
# mean_logprice = mean(log_price)) %>%
# ungroup() %>%
# pivot_longer(cols = 2:3, names_to = "price_type", values_to = "mean_price"),
# aes(x = central_air_new, y = mean_price)) +
# geom_bar(stat = "identity", fill = "#1a9988", alpha = 0.8, width = 0.3) +
# facet_wrap(~price_type, scales = "free") +
# theme_minimal() +
# labs(title = "Mean Property Price (raw and log-transformed) by Central Air",
# x = "Central Air",
# y = "Mean Price")
# parcel_shape
# three categories (A,B,E) seem to be good enough
# propi676_500 %>%
# select(parcel_shape, adjusted_sale_price, log_price) %>%
# st_drop_geometry() %>%
# group_by(parcel_shape) %>%
# summarize(mean_price = mean(adjusted_sale_price),
# mean_logprice = mean(log_price)) %>%
# ungroup() %>%
# pivot_longer(cols = 2:3, names_to = "price_type", values_to = "mean_price") %>%
# ggplot(aes(x = parcel_shape, y = mean_price)) +
# geom_bar(stat = "identity", fill = "#1a9988", alpha = 0.8, width = 0.3) +
# facet_wrap(~price_type, scales = "free") +
# theme_minimal() +
# theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# separate_utilities
# four categories (A,B,C,NA) seem to be good enough
# propi676_500 %>%
# select(separate_utilities, adjusted_sale_price, log_price) %>%
# st_drop_geometry() %>%
# group_by(separate_utilities) %>%
# summarize(mean_price = mean(adjusted_sale_price),
# mean_logprice = mean(log_price)) %>%
# ungroup() %>%
# pivot_longer(cols = 2:3, names_to = "price_type", values_to = "mean_price") %>%
# ggplot(aes(x = separate_utilities, y = mean_price)) +
# geom_bar(stat = "identity", fill = "#1a9988", alpha = 0.8, width = 0.3) +
# facet_wrap(~price_type, scales = "free") +
# theme_minimal() +
# theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# fireplaces
# not a huge factor, so not including in model for now
# but if i were to use this, i think i would just group this into a binary indicating the presence of a fireplace on a property
# unique(propi676_500$fireplaces)
# table(propi676_500$fireplaces)
# summary(propi676_500$fireplaces)
#
# propi676_500 %>%
# select(fireplaces, adjusted_sale_price, log_price) %>%
# st_drop_geometry() %>%
# group_by(fireplaces) %>%
# summarize(mean_price = mean(adjusted_sale_price),
# mean_logprice = mean(log_price)) %>%
# ungroup() %>%
# pivot_longer(cols = 2:3, names_to = "price_type", values_to = "mean_price") %>%
# ggplot(aes(x = fireplaces, y = mean_price)) +
# geom_bar(stat = "identity", fill = "#1a9988", alpha = 0.8, width = 0.3) +
# facet_wrap(~price_type, scales = "free") +
# theme_minimal() +
# theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# i realized halfway through doing this that I could use what Sijia started with
getRMSE <- function(model_results) {
# residual sum of squares
rss <- c(crossprod(model_results$residuals))
# Mean squared error:
mse <- rss / length(model_results$residuals)
# Root MSE:
RMSE <- sqrt(mse)
return(RMSE)
}
# model with just distance to highway
model_i676 <- lm(log_price ~ dist_to_I676m,
data = propi676_500)
model_i676_summary <- summary(model_i676)
model_i676_summary
##
## Call:
## lm(formula = log_price ~ dist_to_I676m, data = propi676_500)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9761 -0.5219 -0.1135 0.4409 3.0335
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.3743509 0.1260092 114.074 < 0.0000000000000002 ***
## dist_to_I676m -0.0019135 0.0003805 -5.029 0.000000789 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9929 on 351 degrees of freedom
## Multiple R-squared: 0.0672, Adjusted R-squared: 0.06455
## F-statistic: 25.29 on 1 and 351 DF, p-value: 0.0000007891
model_i676_RMSE <- getRMSE(model_i676)
From a overly simple model, just looking at how distance to I676 (m) predicts property price (log transformed), we see that for properties within 500m of I676, distance from the highway does not have a large effect on property price. We see that for every additional meter away from I676, property price decreases by 0.2% (basically a null effect). The R squared value of the model is very low at 0.067 and the RMSE is very high 0.99, meaning the model is terrible.
# model with just distance to highway + zip code
model_i676_zip <- lm(log_price ~ dist_to_I676m + zip_code,
data = propi676_500)
model_i676_zip_summary <- summary(model_i676_zip)
model_i676_zip_RMSE <- getRMSE(model_i676_zip)
model_i676_zip_summary
##
## Call:
## lm(formula = log_price ~ dist_to_I676m + zip_code, data = propi676_500)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4559 -0.5364 -0.0900 0.3883 3.2469
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.286503 0.380829 40.140 < 0.0000000000000002 ***
## dist_to_I676m -0.001763 0.000441 -3.998 0.000078 ***
## zip_code19103 -0.870646 0.391550 -2.224 0.026823 *
## zip_code19104 -1.496935 0.414420 -3.612 0.000349 ***
## zip_code19106 -0.969805 0.395024 -2.455 0.014579 *
## zip_code19107 -1.445813 0.403189 -3.586 0.000384 ***
## zip_code19123 -0.315026 0.401933 -0.784 0.433708
## zip_code19130 -1.020486 0.438004 -2.330 0.020390 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9194 on 345 degrees of freedom
## Multiple R-squared: 0.2138, Adjusted R-squared: 0.1979
## F-statistic: 13.41 on 7 and 345 DF, p-value: 0.00000000000000254
When we add ZIP code to this simple model, the adjusted R squared increases to 0.198 and RMSE decreases to 0.909, indicating that we have a slightly better, but still bad model. In this ZIP + distance to I676 model, keeping all else constant property price still decreases only by 0.2% per every additional meter away from the expressway.
# simple model but with more predictors
model_i676_more <- lm(log_price ~ dist_to_I676m +
residential + # internal characteristics
year_built + central_air +
number_of_bathrooms + total_area +
quality_grade_recoded +
factor(zip_code), # neighborhood effects (via zip code for now)
data = propi676_500)
model_i676_more_summary <- summary(model_i676_more)
model_i676_more_RMSE <- getRMSE(model_i676_more)
model_i676_more_summary
##
## Call:
## lm(formula = log_price ~ dist_to_I676m + residential + year_built +
## central_air + number_of_bathrooms + total_area + quality_grade_recoded +
## factor(zip_code), data = propi676_500)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.42503 -0.30862 -0.00947 0.30665 2.61892
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 14.413422595 0.499280750 28.868
## dist_to_I676m -0.000484338 0.000361651 -1.339
## residential -0.650732451 0.152974130 -4.254
## year_built 0.000168566 0.000202783 0.831
## central_airY 0.039468084 0.128035661 0.308
## number_of_bathrooms1 -0.568288121 0.137764180 -4.125
## number_of_bathrooms2 -0.112000409 0.144022500 -0.778
## number_of_bathrooms3orMore 0.197076160 0.149153688 1.321
## total_area 0.000051908 0.000005649 9.189
## quality_grade_recodedHigherQuality 0.242072835 0.201031136 1.204
## quality_grade_recodedHighestQuality 1.262089310 0.490235277 2.574
## quality_grade_recodedLowerQuality -0.038399167 0.126012426 -0.305
## quality_grade_recodedLowestQuality 0.212226594 0.094495872 2.246
## factor(zip_code)19103 -0.215029069 0.299579671 -0.718
## factor(zip_code)19104 -0.986370091 0.313695835 -3.144
## factor(zip_code)19106 -0.436644503 0.301382009 -1.449
## factor(zip_code)19107 -0.565890700 0.308259210 -1.836
## factor(zip_code)19123 -0.330145337 0.307755414 -1.073
## factor(zip_code)19130 -0.400066655 0.337101622 -1.187
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## dist_to_I676m 0.18147
## residential 0.0000278 ***
## year_built 0.40646
## central_airY 0.75809
## number_of_bathrooms1 0.0000476 ***
## number_of_bathrooms2 0.43736
## number_of_bathrooms3orMore 0.18737
## total_area < 0.0000000000000002 ***
## quality_grade_recodedHigherQuality 0.22944
## quality_grade_recodedHighestQuality 0.01050 *
## quality_grade_recodedLowerQuality 0.76078
## quality_grade_recodedLowestQuality 0.02541 *
## factor(zip_code)19103 0.47344
## factor(zip_code)19104 0.00183 **
## factor(zip_code)19106 0.14840
## factor(zip_code)19107 0.06735 .
## factor(zip_code)19123 0.28421
## factor(zip_code)19130 0.23622
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6759 on 311 degrees of freedom
## (23 observations deleted due to missingness)
## Multiple R-squared: 0.5654, Adjusted R-squared: 0.5402
## F-statistic: 22.47 on 18 and 311 DF, p-value: < 0.00000000000000022
Finally, we look at a more robust model (which is still relatively simple, including only internal features of the property in addition to ZIP code and distance to I676). This model includes the following predictors: year_built (categorical), quality_grade (categorical), number_of_bathrooms (categorical), total_area (continuous), central_air (binary), and a variable dictating whether the property is residential or not (binary). This more complex model does a better job at explaining the variance in property price than our previous two models, but is not optimal either. The adjusted R squared for this model is 0.54 and our RMSE is 0.656. When including these other predictors, the effect of distance to highway is not only much smaller than before, but is now also not statistically significant.
# model with just distance to highway
model_i676 <- lm(log_price ~ dist_to_I676m,
data = properties_i676_tomodel)
model_i676_summary <- summary(model_i676)
model_i676_summary
##
## Call:
## lm(formula = log_price ~ dist_to_I676m, data = properties_i676_tomodel)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8062 -0.7753 -0.2439 0.5691 2.8526
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.1060477 0.2271245 62.107 <0.0000000000000002 ***
## dist_to_I676m -0.0008128 0.0006813 -1.193 0.236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.24 on 92 degrees of freedom
## Multiple R-squared: 0.01524, Adjusted R-squared: 0.004532
## F-statistic: 1.423 on 1 and 92 DF, p-value: 0.2359
model_i676_RMSE <- getRMSE(model_i676)
From a overly simple model, just looking at how distance to I676 (m) predicts property price (log transformed), we see that for properties within 500m of I676, distance from the highway does not have a large effect on property price. We see that for every additional meter away from I676, property price decreases by 0.1% (basically a null effect). The R squared value of the model is very low at 0.015 and the RMSE is very high 1.226, meaning the model is terrible.
# model with just distance to highway + zip code
model_i676_zip <- lm(log_price ~ dist_to_I676m + zip_code,
data = properties_i676_tomodel)
model_i676_zip_summary <- summary(model_i676_zip)
model_i676_zip_RMSE <- getRMSE(model_i676_zip)
model_i676_zip_summary
##
## Call:
## lm(formula = log_price ~ dist_to_I676m + zip_code, data = properties_i676_tomodel)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3947 -0.6546 -0.2687 0.7249 3.5057
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.3988283 0.4841292 31.807 < 0.0000000000000002 ***
## dist_to_I676m -0.0025335 0.0008977 -2.822 0.00588 **
## zip_code19107 -1.5509816 0.5026468 -3.086 0.00271 **
## zip_code19123 -0.4090618 0.5439376 -0.752 0.45401
## zip_code19130 0.8887256 1.2331552 0.721 0.47299
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.142 on 89 degrees of freedom
## Multiple R-squared: 0.1919, Adjusted R-squared: 0.1556
## F-statistic: 5.285 on 4 and 89 DF, p-value: 0.0007258
When we add ZIP code to this simple model, the adjusted R squared increases to 0.156 and RMSE decreases to 1.111, indicating that we have a slightly better, but still bad model. In this ZIP + distance to I676 model, keeping all else constant property price still decreases only by 0.3% per every additional meter away from the expressway.
# simple model but with more predictors
model_i676_more <- lm(log_price ~ dist_to_I676m +
residential + # internal characteristics
year_built + central_air +
number_of_bathrooms + total_area +
quality_grade_recoded +
factor(zip_code), # neighborhood effects (via zip code for now)
data = properties_i676_tomodel)
model_i676_more_summary <- summary(model_i676_more)
model_i676_more_RMSE <- getRMSE(model_i676_more)
model_i676_more_summary
##
## Call:
## lm(formula = log_price ~ dist_to_I676m + residential + year_built +
## central_air + number_of_bathrooms + total_area + quality_grade_recoded +
## factor(zip_code), data = properties_i676_tomodel)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.81787 -0.43298 -0.02321 0.34939 2.54948
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 14.44161585 0.75679174 19.083
## dist_to_I676m -0.00051853 0.00081146 -0.639
## residential -0.86338283 0.34115500 -2.531
## year_built 0.00042748 0.00030489 1.402
## central_airY -0.17507996 0.26337123 -0.665
## number_of_bathrooms1 -0.90444685 0.26897009 -3.363
## number_of_bathrooms2 -0.33597326 0.38293364 -0.877
## number_of_bathrooms3orMore -0.33348244 0.38756125 -0.860
## total_area 0.00003892 0.00001422 2.737
## quality_grade_recodedHigherQuality -0.26420112 0.47300566 -0.559
## quality_grade_recodedLowerQuality -0.39350994 0.38362580 -1.026
## quality_grade_recodedLowestQuality 0.03199143 0.26352393 0.121
## factor(zip_code)19107 -0.59911385 0.37180624 -1.611
## factor(zip_code)19123 -0.14641653 0.41875916 -0.350
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## dist_to_I676m 0.52503
## residential 0.01377 *
## year_built 0.16558
## central_airY 0.50852
## number_of_bathrooms1 0.00129 **
## number_of_bathrooms2 0.38347
## number_of_bathrooms3orMore 0.39265
## total_area 0.00796 **
## quality_grade_recodedHigherQuality 0.57835
## quality_grade_recodedLowerQuality 0.30875
## quality_grade_recodedLowestQuality 0.90374
## factor(zip_code)19107 0.11187
## factor(zip_code)19123 0.72772
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7689 on 66 degrees of freedom
## (14 observations deleted due to missingness)
## Multiple R-squared: 0.6561, Adjusted R-squared: 0.5883
## F-statistic: 9.684 on 13 and 66 DF, p-value: 0.00000000007079
Finally, we look at a more robust model (which is still relatively simple, including only internal features of the property in addition to ZIP code and distance to I676). This model includes the following predictors: year_built (categorical), quality_grade (categorical), number_of_bathrooms (categorical), total_area (continuous), central_air (binary), and a variable dictating whether the property is residential or not (binary). This more complex model does a better job at explaining the variance in property price than our previous two models, but is not optimal either. The adjusted R squared for this model is 0.588 and our RMSE is 0.698. When including these other predictors, the effect of distance to highway is not only much smaller than before, but is now also not statistically significant.
# model with just distance to highway
model_i676 <- lm(log_price ~ dist_to_I676m,
data = props_i676_tomodel)
model_i676_summary <- summary(model_i676)
model_i676_summary
##
## Call:
## lm(formula = log_price ~ dist_to_I676m, data = props_i676_tomodel)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8625 -0.5520 -0.1526 0.2890 3.9307
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.9757746 0.0454682 285.382 <0.0000000000000002 ***
## dist_to_I676m 0.0003638 0.0001546 2.353 0.0188 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9687 on 1635 degrees of freedom
## Multiple R-squared: 0.003374, Adjusted R-squared: 0.002764
## F-statistic: 5.534 on 1 and 1635 DF, p-value: 0.01876
model_i676_RMSE <- getRMSE(model_i676)
From a overly simple model, just looking at how distance to I676 (m) predicts property price (log transformed), we see that for properties within 500m of I676, distance from the highway does not have a large effect on property price. We see that for every additional meter away from I676, property price decreases by 0% (basically a null effect). The R squared value of the model is very low at 0.003 and the RMSE is very high 0.968, meaning the model is terrible.
# model with just distance to highway + zip code
model_i676_zip <- lm(log_price ~ dist_to_I676m + zip_code,
data = props_i676_tomodel)
model_i676_zip_summary <- summary(model_i676_zip)
model_i676_zip_RMSE <- getRMSE(model_i676_zip)
model_i676_zip_summary
##
## Call:
## lm(formula = log_price ~ dist_to_I676m + zip_code, data = props_i676_tomodel)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8561 -0.5392 -0.1402 0.2891 3.9379
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.8158471 0.1621570 85.200 < 0.0000000000000002 ***
## dist_to_I676m 0.0005802 0.0002109 2.751 0.00601 **
## zip_code19103 0.7278516 0.9629812 0.756 0.44986
## zip_code19106 -0.7719697 0.1817742 -4.247 0.00002290337 ***
## zip_code19107 -0.9083112 0.1596578 -5.689 0.00000001511 ***
## zip_code19108 1.6930555 0.9624609 1.759 0.07875 .
## zip_code19123 -0.9744245 0.1651721 -5.899 0.00000000443 ***
## zip_code19130 1.0322823 0.4180299 2.469 0.01364 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9497 on 1629 degrees of freedom
## Multiple R-squared: 0.04564, Adjusted R-squared: 0.04154
## F-statistic: 11.13 on 7 and 1629 DF, p-value: 0.00000000000008137
When we add ZIP code to this simple model, the adjusted R squared increases to 0.042 and RMSE decreases to 0.947, indicating that we have a slightly better, but still bad model. In this ZIP + distance to I676 model, keeping all else constant property price still decreases only by 0.1% per every additional meter away from the expressway.
# simple model but with more predictors
model_i676_more <- lm(log_price ~ dist_to_I676m +
residential + # internal characteristics
year_built + central_air +
number_of_bathrooms + total_area +
quality_grade +
factor(zip_code), # neighborhood effects (via zip code for now)
data = props_i676_tomodel)
model_i676_more_summary <- summary(model_i676_more)
model_i676_more_RMSE <- getRMSE(model_i676_more)
model_i676_more_summary
##
## Call:
## lm(formula = log_price ~ dist_to_I676m + residential + year_built +
## central_air + number_of_bathrooms + total_area + quality_grade +
## factor(zip_code), data = props_i676_tomodel)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1942 -0.3843 -0.0473 0.3397 1.8767
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.16629030 1.50467617 6.756 0.0000000000383 ***
## dist_to_I676m -0.00020242 0.00030114 -0.672 0.50177
## residential NA NA NA NA
## year_built 0.00186300 0.00071154 2.618 0.00910 **
## central_air1 -0.03835507 0.75758257 -0.051 0.95964
## central_airN 0.42238405 0.75760858 0.558 0.57741
## central_airY 0.56863417 0.76630951 0.742 0.45840
## number_of_bathrooms 0.03219029 0.03880960 0.829 0.40724
## total_area 0.00009833 0.00003165 3.106 0.00200 **
## quality_gradeA- -0.08325870 0.53966656 -0.154 0.87745
## quality_gradeB -0.69513925 0.44379648 -1.566 0.11788
## quality_gradeB- -0.90727116 0.45519723 -1.993 0.04677 *
## quality_gradeB+ -0.92444444 0.45435938 -2.035 0.04240 *
## quality_gradeC -1.17251638 0.44555253 -2.632 0.00875 **
## quality_gradeC- -0.10463517 0.76756756 -0.136 0.89162
## quality_gradeC+ -0.94065757 0.45360771 -2.074 0.03860 *
## quality_gradeE+ -0.34693449 0.76297528 -0.455 0.64951
## factor(zip_code)19106 -0.64946193 0.66575001 -0.976 0.32975
## factor(zip_code)19107 -0.94452223 0.62872501 -1.502 0.13364
## factor(zip_code)19123 -0.64402793 0.63170656 -1.020 0.30844
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6183 on 515 degrees of freedom
## (1103 observations deleted due to missingness)
## Multiple R-squared: 0.1719, Adjusted R-squared: 0.1429
## F-statistic: 5.939 on 18 and 515 DF, p-value: 0.0000000000003974
Finally, we look at a more robust model (which is still relatively simple, including only internal features of the property in addition to ZIP code and distance to I676). This model includes the following predictors: year_built (categorical), quality_grade (categorical), number_of_bathrooms (categorical), total_area (continuous), central_air (binary), and a variable dictating whether the property is residential or not (binary). This more complex model does a better job at explaining the variance in property price than our previous two models, but is not optimal either. The adjusted R squared for this model is 0.143 and our RMSE is 0.607. When including these other predictors, the effect of distance to highway is not only much smaller than before, but is now also not statistically significant.
We move forward with our analyses with the hypothesis that although distance to I676 does not play as much of a role in predicting property price, the Chinatown Stitch project will create an open green space that we can conceptualize as a park. We predict that the Chinatown Stitch cap as a park will have a statistically significant effect on property prices nearby.
# property_discontinutiy <- property_EDA_update %>%
# mutate(distance_to_highway = case_when(
# I676 == "north" ~ distance_to_I676,
# I676 == "south" ~ -distance_to_I676
# ))
#
# discontinuity_plot <- property_discontinutiy %>%
# st_drop_geometry() %>%
# filter(sale_price > 10 & sale_price < 1e6) %>%
# group_by(distance_to_highway, I676) %>%
# summarise(log_sale_price = mean(log_sale_price, na.rm = TRUE), .groups = "drop")
# with updated data (only 94 properties)
# discontinuity_plot <- properties_i676 %>%
# st_drop_geometry() %>%
# mutate(dist_to_I676m = distance_to_I676 * 0.3048,
# distance_to_highway = case_when(
# I676 == "north" ~ dist_to_I676m,
# I676 == "south" ~ -dist_to_I676m
# ),
# log_price = log(adjusted_sale_price)) %>%
# group_by(distance_to_highway, I676) %>%
# summarise(mean_log_price = mean(log_price, na.rm = TRUE), .groups = "drop")
# with most recently updated data (990 properties)
discontinuity_plot <- props_i676_tomodel %>%
st_drop_geometry() %>%
mutate(distance_to_highway = case_when(
I676_NS == "north" ~ dist_to_I676m,
I676_NS == "south" ~ -dist_to_I676m
)) %>%
group_by(distance_to_highway, I676_NS) %>%
summarise(mean_log_price = mean(log_price, na.rm = TRUE), .groups = "drop")
# correlations to add as annotations on the plot
cor_north <- round(cor(discontinuity_plot %>%
filter(
abs(distance_to_highway) <= 500,
I676_NS == "north") %>%
pull(mean_log_price),
discontinuity_plot %>%
filter(
abs(distance_to_highway) <= 500,
I676_NS == "north") %>%
pull(distance_to_highway)), 3)
cor_south <- round(cor(discontinuity_plot %>%
filter(
abs(distance_to_highway) <= 500,
I676_NS == "south") %>%
pull(mean_log_price),
discontinuity_plot %>%
filter(
abs(distance_to_highway) <= 500,
I676_NS == "south") %>%
pull(distance_to_highway)), 3)
discontinuity_plot %>%
filter(abs(distance_to_highway) <= 500) %>% # limiting it to the properties within 500m
ggplot(aes(x = distance_to_highway, y = mean_log_price, color = I676_NS)) +
geom_point(size = 0.5) +
geom_smooth(method = "lm", se = FALSE, size = 1.2) +
geom_vline(xintercept = 0, color = "black", size = 2) +
scale_colour_manual(values = c("#eb5600", "#1a9988")) +
theme_minimal() +
theme(legend.position = "None",
plot.title = element_text(size = 16, face = "bold"),
plot.subtitle = ggtext::element_markdown(size = 12),
plot.caption = element_text(hjust = 0)) +
labs(title = 'Logged Sale Price as a Function of Distance to I-676',
subtitle = glue("<span style='color:#1a9988;'>South Side</span> vs. <span style='color:#eb5600;'>North Side</span>"),
x = "Distance to the Highyway (m)",
y = "Log-transformed Sale Price") +
# add annotations
geom_label(aes(x = 0, y = 8, label = "I-676"),
fill = "black", color = "white", fontface = "bold", size = 5) +
geom_text(aes(label = cor_north,
x = 250,
y = 11.825),
color = "#eb5600",
show.legend = F) +
geom_text(aes(label = cor_south,
x = -250,
y = 11.825),
color = "#1a9988",
show.legend = F)
# save image for powerpoint
# ggsave("PPT/Images/p2_discontinuity_within_500m.png",
# height = 7, width = 10, units = "in")
# same discontinuity plot but with the lines reflecting the simple model we made in the section above
# Aki - tbh idk if this is feasible....but it also might just be becasue i'm running on fumes rn....let's check back in after the second presentation
# below is code suggestions from ChatGPT but it needs to be tweaked.
# Create a new data frame with the predictor variable
# prediction_data <- data.frame(dist_to_I676m = discontinuity_plot$distance_to_highway)
#
# # Predict values using the fitted model
# prediction_data$predicted_log_price <- predict(model_i676_more, newdata = prediction_data)
#
# ggplot(discontinuity_plot, aes(x = distance_to_highway, y = mean_log_price, color = I676)) +
# geom_point(size = 0.5) +
# geom_line(data = prediction_data, aes(x = dist_to_I676m, y = predicted_log_price),
# color = "black", size = 1.2) + # Use geom_line for custom model fit
# geom_vline(xintercept = 0, color = "black", size = 2) +
# scale_colour_manual(values = c("#eb5600", "#1a9988")) +
# theme_minimal() +
# theme(legend.position = "None",
# plot.title = element_text(size = 16, face = "bold"),
# plot.subtitle = ggtext::element_markdown(size = 12),
# plot.caption = element_text(hjust = 0)) +
# labs(title = 'Logged Sale Price as a Function of Distance to I-676',
# subtitle = glue("<span style='color:#1a9988;'>South Side</span> vs. <span style='color:#eb5600;'>North Side</span>"),
# caption = "Lines represent Model 3 from our previous slide (Highway + Spatial + Internal Characteristic Effects)",
# x = "Distance to the Highway (m) ",
# y = "Log-transformed Sale Price") +
# geom_label(aes(x = 0, y = 8, label = "I-676"),
# fill = "black", color = "white", fontface = "bold", size = 5)
# save image for powerpoint
# ggsave("PPT/Images/p2_discontinuity_modellines.png",
# height = 7, width = 10, units = "in")
2.4 Social Survey